The Thera bank recently saw a steep decline in the number of users of their credit card. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, and foreign transaction fees. Some fees are charged to every user irrespective of usage, while others are charged under specified circumstances.
Customers leaving the credit card service would lead the bank to losses, so the bank wants to analyze its customer data to identify the customers who will leave the credit card service and the reasons for leaving – so that the bank can improve upon those areas.
Thera bank wants a classification model that will help the bank improve its services so that customers do not renounce their credit cards
We need to identify the best possible model that will give the required performance
Explore and visualize the dataset.
Build a classification model to predict if the customer is going to churn or not.
Optimize the model using appropriate techniques.
Generate a set of insights and recommendations that will help the bank.
This dataset contains the information of the Thera Bank's customer data
import warnings

warnings.filterwarnings("ignore")

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Library to split data
from sklearn.model_selection import train_test_split

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit from the number of displayed columns and rows.
# This is so I can see the entire dataframe when I print it
pd.set_option("display.max_columns", None)
# pd.set_option('display.max_rows', None)
pd.set_option("display.max_rows", 200)

# To build linear model for statistical analysis and prediction
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# To tune the model and cross validation
from sklearn.model_selection import StratifiedKFold, cross_val_score

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator / .from_predictions is the supported
# replacement, so import it and fall back gracefully on older versions.
from sklearn.metrics import ConfusionMatrixDisplay
try:
    from sklearn.metrics import plot_confusion_matrix  # scikit-learn < 1.2 only
except ImportError:
    plot_confusion_matrix = None  # use ConfusionMatrixDisplay instead

# To impute missing values
from sklearn.impute import KNNImputer, SimpleImputer

# To build sklearn model
from sklearn.linear_model import LogisticRegression

# To get different metric scores
from sklearn import metrics
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    precision_recall_curve,
)

# To build Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# To build bagging classifier and Random Forest model
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To build boosting models
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier

# To install the xgboost library use - !pip install xgboost
from xgboost import XGBClassifier

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# Load the source CSV file into a dataframe
data = pd.read_csv('BankChurners.csv')

# Work on a copy so the original frame stays untouched
Customer_data = data.copy()

# Inspect the first and last five rows of the dataset
print(Customer_data.head())
print(Customer_data.tail())
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \
0 768805383 Existing Customer 45 M 3
1 818770008 Existing Customer 49 F 5
2 713982108 Existing Customer 51 M 3
3 769911858 Existing Customer 40 F 4
4 709106358 Existing Customer 40 M 3
Education_Level Marital_Status Income_Category Card_Category \
0 High School Married $60K - $80K Blue
1 Graduate Single Less than $40K Blue
2 Graduate Married $80K - $120K Blue
3 High School NaN Less than $40K Blue
4 Uneducated Married $60K - $80K Blue
Months_on_book Total_Relationship_Count Months_Inactive_12_mon \
0 39 5 1
1 44 6 1
2 36 4 1
3 34 3 4
4 21 5 1
Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \
0 3 12691.0 777 11914.0
1 2 8256.0 864 7392.0
2 0 3418.0 0 3418.0
3 1 3313.0 2517 796.0
4 0 4716.0 0 4716.0
Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \
0 1.335 1144 42 1.625
1 1.541 1291 33 3.714
2 2.594 1887 20 2.333
3 1.405 1171 20 2.333
4 2.175 816 28 2.500
Avg_Utilization_Ratio
0 0.061
1 0.105
2 0.000
3 0.760
4 0.000
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \
10122 772366833 Existing Customer 50 M 2
10123 710638233 Attrited Customer 41 M 2
10124 716506083 Attrited Customer 44 F 1
10125 717406983 Attrited Customer 30 M 2
10126 714337233 Attrited Customer 43 F 2
Education_Level Marital_Status Income_Category Card_Category \
10122 Graduate Single $40K - $60K Blue
10123 NaN Divorced $40K - $60K Blue
10124 High School Married Less than $40K Blue
10125 Graduate NaN $40K - $60K Blue
10126 Graduate Married Less than $40K Silver
Months_on_book Total_Relationship_Count Months_Inactive_12_mon \
10122 40 3 2
10123 25 4 2
10124 36 5 3
10125 36 4 3
10126 25 6 2
Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal \
10122 3 4003.0 1851
10123 3 4277.0 2186
10124 4 5409.0 0
10125 3 5281.0 0
10126 4 10388.0 1961
Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \
10122 2152.0 0.703 15476 117
10123 2091.0 0.804 8764 69
10124 5409.0 0.819 10291 60
10125 5281.0 0.535 8395 62
10126 8427.0 0.703 10294 61
Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
10122 0.857 0.462
10123 0.683 0.511
10124 0.818 0.000
10125 0.722 0.000
10126 0.649 0.189
# Report dataset dimensions as (rows, columns)
dims = Customer_data.shape
print(dims)
(10127, 21)
# Count missing values per column and fully duplicated rows
missing_per_column = Customer_data.isna().sum()
duplicate_rows = Customer_data.duplicated().sum()
print(missing_per_column)
print(duplicate_rows)
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 0
# Review column dtypes and non-null counts.
# info() writes its report to stdout and returns None, so the extra
# print() only adds a trailing "None" line (kept for identical output).
summary = Customer_data.info()
print(summary)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB None
We see that CLIENTNUM column is customer id does not have any statistical values and hence we will be dropping that column as a part of clean up.
Dependent variable is the Attrition_Flag which is of object data type and will be label encoded and converted to numeric data type for modelling.
All the variables with object data type will be converted to categorical data types soon.
There are missing values in the column Education_Level and Marital_Status in the dataset.
# Statistical summary (count, mean, std, quartiles) of the numeric columns
summary_stats = Customer_data.describe()
print(summary_stats)
CLIENTNUM Customer_Age Dependent_count Months_on_book \
count 1.012700e+04 10127.000000 10127.000000 10127.000000
mean 7.391776e+08 46.325960 2.346203 35.928409
std 3.690378e+07 8.016814 1.298908 7.986416
min 7.080821e+08 26.000000 0.000000 13.000000
25% 7.130368e+08 41.000000 1.000000 31.000000
50% 7.179264e+08 46.000000 2.000000 36.000000
75% 7.731435e+08 52.000000 3.000000 40.000000
max 8.283431e+08 73.000000 5.000000 56.000000
Total_Relationship_Count Months_Inactive_12_mon \
count 10127.000000 10127.000000
mean 3.812580 2.341167
std 1.554408 1.010622
min 1.000000 0.000000
25% 3.000000 2.000000
50% 4.000000 2.000000
75% 5.000000 3.000000
max 6.000000 6.000000
Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal \
count 10127.000000 10127.000000 10127.000000
mean 2.455317 8631.953698 1162.814061
std 1.106225 9088.776650 814.987335
min 0.000000 1438.300000 0.000000
25% 2.000000 2555.000000 359.000000
50% 2.000000 4549.000000 1276.000000
75% 3.000000 11067.500000 1784.000000
max 6.000000 34516.000000 2517.000000
Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \
count 10127.000000 10127.000000 10127.000000 10127.000000
mean 7469.139637 0.759941 4404.086304 64.858695
std 9090.685324 0.219207 3397.129254 23.472570
min 3.000000 0.000000 510.000000 10.000000
25% 1324.500000 0.631000 2155.500000 45.000000
50% 3474.000000 0.736000 3899.000000 67.000000
75% 9859.000000 0.859000 4741.000000 81.000000
max 34516.000000 3.397000 18484.000000 139.000000
Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
count 10127.000000 10127.000000
mean 0.712222 0.274894
std 0.238086 0.275691
min 0.000000 0.000000
25% 0.582000 0.023000
50% 0.702000 0.176000
75% 0.818000 0.503000
max 3.714000 0.999000
CLIENTNUM: It is just a mere customer ID number and will not add any statistical value to our Analysis.
Customer_Age: Average age of people in the dataset is 46 years, age has a wide range from 26 to 73 years.
Dependent_count: The average dependent count is 2. There are single customers as well with no dependents and customers with a max dependent count of 5.
Months_on_book: The average customers credit card account age is 36 months.The mean and median are almost equal to 36 indicating symmetry in data.However the min and 25% is quite apart and is also the case with 75% percentile and maximum hinting extreme values on both the tails.
Total_Relationship_count: The mean of total relationship count is 3.
Months_Inactive_12_mon: On average the customers accounts are inactive for 2 -2.5 months in a year.
Contacts_Count_12_mon: The average number of times the customer contacts the bank is 2, and most of the customers in the dataset had contacted up to 3 times, as seen at the 75th percentile.
Credit_Limit: On average customers credit limit is around 9000 dollars.A vast difference in the 75th(11000 dollars)percentile and the maximum value(35000 dollars), indicates that there might be outliers present in the variable.
Total_Revolving_Bal: The average revolving balance for customers is 1100 dollars. 25% either use the card very limited or pay back the balances on time with a low balance carry forward.
Avg_Open_To_Buy:The average available credit to use in the card for customers is 7500 dollars. A vast difference in the 75th(10000 dollars)percentile and the maximum value(35000 dollars), indicates that there might be outliers present in the variable.
Total_Amt_Chng_Q4_Q1: The mean of the ratio of transaction amount in 1st to 4th quarter is 0.759941
Total_Trans_Amt:The mean of the transaction amount spent by the customer is 4500 dollars.The vast difference between 75%(5000 dollars) percentile and maximum value(18000 dollars) indicate the presence of outliers,
Total_Trans_Ct: The mean of the number of transactions made by the customer is 65. The vast difference between the 75th percentile and the maximum value indicates the presence of outliers.
Total_Ct_Chng_Q4_Q1:The mean of the ratio of transaction count in 1st to 4th quarter is 0.712222
Avg_Utilization_Ratio: The average utilization ratio is 0.27. However, for 50% of the customers it is really low at 17%, indicating more than half of the customers in the dataset do not use their credit cards much.
# Grab the column index to iterate over later.
# NOTE: include='all' makes describe() cover object columns too, so despite
# the name this holds EVERY column, not just the numeric ones.
num_columns = Customer_data.describe(include='all').columns
num_columns
Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
'Dependent_count', 'Education_Level', 'Marital_Status',
'Income_Category', 'Card_Category', 'Months_on_book',
'Total_Relationship_Count', 'Months_Inactive_12_mon',
'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
dtype='object')
# Show the frequency distribution of every column, separated by a rule
for col in num_columns:
    print('Unique values in', col, 'are :')
    print(Customer_data[col].value_counts())
    print('*' * 50)
Unique values in CLIENTNUM are :
780097533 1
720049083 1
717376758 1
720598308 1
719930658 1
..
818987958 1
808753758 1
789347133 1
806189658 1
713594883 1
Name: CLIENTNUM, Length: 10127, dtype: int64
**************************************************
Unique values in Attrition_Flag are :
Existing Customer 8500
Attrited Customer 1627
Name: Attrition_Flag, dtype: int64
**************************************************
Unique values in Customer_Age are :
44 500
49 495
46 490
45 486
47 479
43 473
48 472
50 452
42 426
51 398
53 387
41 379
52 376
40 361
39 333
54 307
38 303
55 279
56 262
37 260
57 223
36 221
35 184
59 157
58 157
34 146
60 127
33 127
32 106
65 101
61 93
62 93
31 91
26 78
30 70
63 65
29 56
64 43
27 32
28 29
67 4
66 2
68 2
73 1
70 1
Name: Customer_Age, dtype: int64
**************************************************
Unique values in Gender are :
F 5358
M 4769
Name: Gender, dtype: int64
**************************************************
Unique values in Dependent_count are :
3 2732
2 2655
1 1838
4 1574
0 904
5 424
Name: Dependent_count, dtype: int64
**************************************************
Unique values in Education_Level are :
Graduate 3128
High School 2013
Uneducated 1487
College 1013
Post-Graduate 516
Doctorate 451
Name: Education_Level, dtype: int64
**************************************************
Unique values in Marital_Status are :
Married 4687
Single 3943
Divorced 748
Name: Marital_Status, dtype: int64
**************************************************
Unique values in Income_Category are :
Less than $40K 3561
$40K - $60K 1790
$80K - $120K 1535
$60K - $80K 1402
abc 1112
$120K + 727
Name: Income_Category, dtype: int64
**************************************************
Unique values in Card_Category are :
Blue 9436
Silver 555
Gold 116
Platinum 20
Name: Card_Category, dtype: int64
**************************************************
Unique values in Months_on_book are :
36 2463
37 358
34 353
38 347
39 341
40 333
31 318
35 317
33 305
30 300
41 297
32 289
28 275
43 273
42 271
29 241
44 230
45 227
27 206
46 197
26 186
47 171
25 165
48 162
24 160
49 141
23 116
22 105
56 103
50 96
21 83
51 80
53 78
20 74
13 70
19 63
52 62
18 58
54 53
55 42
17 39
15 34
16 29
14 16
Name: Months_on_book, dtype: int64
**************************************************
Unique values in Total_Relationship_Count are :
3 2305
4 1912
5 1891
6 1866
2 1243
1 910
Name: Total_Relationship_Count, dtype: int64
**************************************************
Unique values in Months_Inactive_12_mon are :
3 3846
2 3282
1 2233
4 435
5 178
6 124
0 29
Name: Months_Inactive_12_mon, dtype: int64
**************************************************
Unique values in Contacts_Count_12_mon are :
3 3380
2 3227
1 1499
4 1392
0 399
5 176
6 54
Name: Contacts_Count_12_mon, dtype: int64
**************************************************
Unique values in Credit_Limit are :
34516.0 508
1438.3 507
15987.0 18
9959.0 18
23981.0 12
...
3891.0 1
19354.0 1
34427.0 1
14527.0 1
8206.0 1
Name: Credit_Limit, Length: 6205, dtype: int64
**************************************************
Unique values in Total_Revolving_Bal are :
0 2470
2517 508
1965 12
1480 12
1720 11
...
1401 1
2293 1
2172 1
2040 1
204 1
Name: Total_Revolving_Bal, Length: 1974, dtype: int64
**************************************************
Unique values in Avg_Open_To_Buy are :
1438.3 324
34516.0 98
31999.0 26
787.0 8
953.0 7
...
2214.0 1
4458.0 1
8013.0 1
33398.0 1
4117.0 1
Name: Avg_Open_To_Buy, Length: 6813, dtype: int64
**************************************************
Unique values in Total_Amt_Chng_Q4_Q1 are :
0.791 36
0.743 34
0.712 34
0.735 33
0.718 33
..
0.330 1
1.252 1
2.271 1
0.175 1
1.750 1
Name: Total_Amt_Chng_Q4_Q1, Length: 1158, dtype: int64
**************************************************
Unique values in Total_Trans_Amt are :
4253 11
4509 11
2229 10
4518 10
4869 9
..
1810 1
8257 1
14402 1
2174 1
10294 1
Name: Total_Trans_Amt, Length: 5033, dtype: int64
**************************************************
Unique values in Total_Trans_Ct are :
81 208
75 203
71 203
82 202
69 202
76 198
77 197
70 193
78 190
74 190
67 186
79 184
73 183
80 173
68 170
83 169
72 168
65 166
66 164
64 158
63 150
85 148
43 147
84 147
37 141
38 139
41 138
87 137
35 136
40 136
36 135
62 134
86 133
42 132
45 129
44 127
39 126
49 118
61 118
33 116
88 114
60 111
47 110
34 107
56 106
32 104
58 103
46 100
31 100
48 98
59 97
57 94
89 93
51 92
50 91
54 89
53 85
30 84
90 83
27 82
55 78
29 75
28 73
92 66
52 64
91 62
25 57
26 56
93 55
94 51
24 50
96 44
97 42
95 40
99 38
100 38
22 35
23 34
21 33
116 32
105 32
104 31
106 31
98 31
120 31
103 31
102 30
124 28
101 25
115 25
110 25
112 24
114 23
18 23
113 23
111 22
121 22
118 22
109 22
117 21
108 21
20 19
122 18
15 16
119 16
123 15
107 14
17 13
16 13
127 12
125 12
19 11
128 10
126 10
14 9
129 6
131 6
130 5
13 5
10 4
12 4
11 2
138 1
139 1
132 1
134 1
Name: Total_Trans_Ct, dtype: int64
**************************************************
Unique values in Total_Ct_Chng_Q4_Q1 are :
0.667 171
1.000 166
0.500 161
0.750 156
0.600 113
...
1.476 1
0.119 1
1.161 1
1.093 1
1.533 1
Name: Total_Ct_Chng_Q4_Q1, Length: 830, dtype: int64
**************************************************
Unique values in Avg_Utilization_Ratio are :
0.000 2470
0.073 44
0.057 33
0.048 32
0.060 30
...
0.929 1
0.874 1
0.995 1
0.011 1
0.985 1
Name: Avg_Utilization_Ratio, Length: 964, dtype: int64
**************************************************
# CLIENTNUM is unique per customer and carries no predictive signal — drop it
Customer_data.drop(["CLIENTNUM"], axis=1, inplace=True)

# Encode the target in one pass: 1 = Attrited Customer, 0 = Existing Customer.
# replace() leaves any other value untouched, matching the original
# two-step lambda encoding exactly.
Customer_data.Attrition_Flag = Customer_data.Attrition_Flag.replace(
    {'Existing Customer': 0, 'Attrited Customer': 1}
)
Customer_data.tail()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 0 | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 1 | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 1 | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 1 | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 1 | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
# Bucket selected continuous variables into labelled ranges (new columns
# named <original>Range) so they can be analysed as categoricals.
# pd.cut excludes the left edge of the first interval, which is why several
# lower edges sit just below the observed minimum (e.g. -1, -0.1, 2, 9).
range_bins = {
    'Customer_Age': [20, 30, 40, 50, 60, 70, 80],
    'Months_on_book': [12, 24, 36, 48, 60],
    'Credit_Limit': [1000, 5000, 10000, 15000, 20000, 25000, 30000, 35000],
    'Total_Revolving_Bal': [-1, 500, 1000, 1500, 2000, 2500, 3000],
    'Avg_Open_To_Buy': [2, 5000, 10000, 15000, 20000, 25000, 30000, 35000],
    'Total_Trans_Amt': [500, 1000, 5000, 10000, 15000, 20000],
    'Total_Trans_Ct': [9, 50, 100, 150],
    'Total_Ct_Chng_Q4_Q1': [-0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4],
    'Total_Amt_Chng_Q4_Q1': [-0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4],
    'Avg_Utilization_Ratio': [-0.1, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}
for col, bins in range_bins.items():
    Customer_data[col + 'Range'] = pd.cut(x=Customer_data[col], bins=bins)

# Print once — the original printed the same head() twice back to back.
print(Customer_data.head())
Attrition_Flag Customer_Age Gender Dependent_count Education_Level \ 0 0 45 M 3 High School 1 0 49 F 5 Graduate 2 0 51 M 3 Graduate 3 0 40 F 4 High School 4 0 40 M 3 Uneducated Marital_Status Income_Category Card_Category Months_on_book \ 0 Married $60K - $80K Blue 39 1 Single Less than $40K Blue 44 2 Married $80K - $120K Blue 36 3 NaN Less than $40K Blue 34 4 Married $60K - $80K Blue 21 Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon \ 0 5 1 3 1 6 1 2 2 4 1 0 3 3 4 1 4 5 1 0 Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 \ 0 12691.0 777 11914.0 1.335 1 8256.0 864 7392.0 1.541 2 3418.0 0 3418.0 2.594 3 3313.0 2517 796.0 1.405 4 4716.0 0 4716.0 2.175 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \ 0 1144 42 1.625 1 1291 33 3.714 2 1887 20 2.333 3 1171 20 2.333 4 816 28 2.500 Avg_Utilization_Ratio Customer_AgeRange Months_on_bookRange \ 0 0.061 (40, 50] (36, 48] 1 0.105 (40, 50] (36, 48] 2 0.000 (50, 60] (24, 36] 3 0.760 (30, 40] (24, 36] 4 0.000 (30, 40] (12, 24] Credit_LimitRange Total_Revolving_BalRange Avg_Open_To_BuyRange \ 0 (10000, 15000] (500, 1000] (10000, 15000] 1 (5000, 10000] (500, 1000] (5000, 10000] 2 (1000, 5000] (-1, 500] (2, 5000] 3 (1000, 5000] (2500, 3000] (2, 5000] 4 (1000, 5000] (-1, 500] (2, 5000] Total_Trans_AmtRange Total_Trans_CtRange Total_Ct_Chng_Q4_Q1Range \ 0 (1000, 5000] (9, 50] (1.5, 2.0] 1 (1000, 5000] (9, 50] (3.5, 4.0] 2 (1000, 5000] (9, 50] (2.0, 2.5] 3 (1000, 5000] (9, 50] (2.0, 2.5] 4 (500, 1000] (9, 50] (2.0, 2.5] Total_Amt_Chng_Q4_Q1Range Avg_Utilization_RatioRange 0 (1.0, 1.5] (-0.1, 0.1] 1 (1.5, 2.0] (0.1, 0.2] 2 (2.5, 3.0] (-0.1, 0.1] 3 (1.0, 1.5] (0.7, 0.8] 4 (2.0, 2.5] (-0.1, 0.1] Attrition_Flag Customer_Age Gender Dependent_count Education_Level \ 0 0 45 M 3 High School 1 0 49 F 5 Graduate 2 0 51 M 3 Graduate 3 0 40 F 4 High School 4 0 40 M 3 Uneducated Marital_Status Income_Category Card_Category Months_on_book \ 0 Married $60K - $80K Blue 39 1 Single Less than $40K Blue 
44 2 Married $80K - $120K Blue 36 3 NaN Less than $40K Blue 34 4 Married $60K - $80K Blue 21 Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon \ 0 5 1 3 1 6 1 2 2 4 1 0 3 3 4 1 4 5 1 0 Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 \ 0 12691.0 777 11914.0 1.335 1 8256.0 864 7392.0 1.541 2 3418.0 0 3418.0 2.594 3 3313.0 2517 796.0 1.405 4 4716.0 0 4716.0 2.175 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \ 0 1144 42 1.625 1 1291 33 3.714 2 1887 20 2.333 3 1171 20 2.333 4 816 28 2.500 Avg_Utilization_Ratio Customer_AgeRange Months_on_bookRange \ 0 0.061 (40, 50] (36, 48] 1 0.105 (40, 50] (36, 48] 2 0.000 (50, 60] (24, 36] 3 0.760 (30, 40] (24, 36] 4 0.000 (30, 40] (12, 24] Credit_LimitRange Total_Revolving_BalRange Avg_Open_To_BuyRange \ 0 (10000, 15000] (500, 1000] (10000, 15000] 1 (5000, 10000] (500, 1000] (5000, 10000] 2 (1000, 5000] (-1, 500] (2, 5000] 3 (1000, 5000] (2500, 3000] (2, 5000] 4 (1000, 5000] (-1, 500] (2, 5000] Total_Trans_AmtRange Total_Trans_CtRange Total_Ct_Chng_Q4_Q1Range \ 0 (1000, 5000] (9, 50] (1.5, 2.0] 1 (1000, 5000] (9, 50] (3.5, 4.0] 2 (1000, 5000] (9, 50] (2.0, 2.5] 3 (1000, 5000] (9, 50] (2.0, 2.5] 4 (500, 1000] (9, 50] (2.0, 2.5] Total_Amt_Chng_Q4_Q1Range Avg_Utilization_RatioRange 0 (1.0, 1.5] (-0.1, 0.1] 1 (1.5, 2.0] (0.1, 0.2] 2 (2.5, 3.0] (-0.1, 0.1] 3 (1.0, 1.5] (0.7, 0.8] 4 (2.0, 2.5] (-0.1, 0.1]
# Cast the encoded target to integer and every nominal variable to a
# pandas categorical dtype (smaller memory footprint, explicit levels).
Customer_data['Attrition_Flag'] = Customer_data['Attrition_Flag'].astype('int64')
for nominal_col in ['Gender', 'Education_Level', 'Marital_Status',
                    'Income_Category', 'Card_Category']:
    Customer_data[nominal_col] = Customer_data[nominal_col].astype('category')
Customer_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 20 Customer_AgeRange 10127 non-null category 21 Months_on_bookRange 10127 non-null category 22 Credit_LimitRange 10127 non-null category 23 Total_Revolving_BalRange 10127 non-null category 24 Avg_Open_To_BuyRange 10127 non-null category 25 Total_Trans_AmtRange 10127 non-null category 26 Total_Trans_CtRange 10127 non-null category 27 Total_Ct_Chng_Q4_Q1Range 10127 non-null category 28 Total_Amt_Chng_Q4_Q1Range 10127 non-null category 29 Avg_Utilization_RatioRange 10127 non-null category dtypes: category(15), float64(5), int64(10) memory usage: 1.3 MB
# Plot a histogram (with KDE) for every numeric column on one figure.
all_col = Customer_data.select_dtypes(include=np.number).columns.tolist()

plt.figure(figsize=(17, 75))
for i, col in enumerate(all_col):
    plt.subplot(18, 3, i + 1)
    sns.histplot(Customer_data[col], kde=True)
    # Title set BEFORE tight_layout so the layout reserves room for it;
    # the original titled after layout and re-ran tight_layout per subplot.
    plt.title(col, fontsize=25)
plt.tight_layout()
plt.show()
Attrition_Flag: It is our dependent variable.The data is imbalanced with very less attrited customers 20% and 80% existing customers.
Customer_Age: Average age of people in the dataset is 46 years, age has a wide range from 26 to 73 years.
Months_on_book: The average customers credit card account age is 36 months.The mean and median are almost equal to 36 indicating symmetry in data.
Total_Relationship_count: The mean of total relationship count is 3.
Months_Inactive_12_mon: On average the customers accounts are inactive for 2 -2.5 months in a year.
Contacts_Count_12_mon: The average number of times the customer contacts the bank is 2, and most of the customers in the dataset had contacted up to 3 times, as seen at the 75th percentile.
Credit_Limit: Data is heavily right skewed with lot of outliers on the right tail.
Total_Revolving_Bal: it is kind of uniform distribution with minimum values having a higher count indicating 25% the customers are not using the card and 7.5% of the customers are heavily using the card.
Avg_Open_To_Buy: Data is heavily right skewed with lot of outliers on the right tail.
Total_Amt_Chng_Q4_Q1: Data has a normal distribution with a lot of outliers on the right end.
Total_Trans_Amt: This distribution is multimodal with lots of peaks.
Total_Trans_Ct: This distribution is bimodal with two peaks.
Total_Ct_Chng_Q4_Q1: Data has a normal distribution with a lot of outliers on the right end.
Avg_Utilization_Ratio: Data is heavily right skewed with lot of outliers.
# While doing uni-variate analysis of numerical variables we want to study their central tendency
# and dispersion.
# Let us write a function that will help us create boxplot and histogram for any input numerical
# variable.
# This function takes the numerical column as the input and returns the boxplots
# and histograms for the variable.
def histogram_boxplot(feature, figsize=(10, 5), bins=None):
    """Boxplot and histogram combined for one numerical feature.

    The boxplot sits above the histogram on a shared x-axis; a star marks
    the mean on the boxplot, while a dashed green line (mean) and a solid
    black line (median) are drawn on the histogram.

    feature: 1-d feature array (pandas Series or array-like)
    figsize: size of fig (default (10, 5))
    bins: number of bins (default None / auto)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,                                    # 2 rows: boxplot over histogram
        sharex=True,                                # shared x-axis keeps the plots aligned
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans=True adds a star at the mean value of the column
    sns.boxplot(x=feature, ax=ax_box2, showmeans=True, color='yellow')
    # Histogram. The original passed the undefined name `F` for kde (a
    # NameError for `False`) and used the long-deprecated sns.distplot;
    # sns.histplot is the supported equivalent and matches the rest of
    # this script.
    if bins:
        sns.histplot(feature, ax=ax_hist2, bins=bins, color='cyan')
    else:
        sns.histplot(feature, ax=ax_hist2, color='tab:red')
    ax_hist2.axvline(np.mean(feature), color='green', linestyle='--')  # mean marker
    ax_hist2.axvline(np.median(feature), color='black', linestyle='-')  # median marker
# Univariate boxplot + histogram for the key numeric drivers
for numeric_col in ['Customer_Age', 'Credit_Limit', 'Total_Revolving_Bal',
                    'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1']:
    histogram_boxplot(Customer_data[numeric_col])
# Function to create barplots that indicate percentage for each category.
def perc_on_bar(z):
    """Countplot of column *z* of Customer_data, with each bar annotated
    by its percentage of the total row count.

    z: name of a categorical / discrete column in Customer_data
    (the function won't work if a column is passed in the hue parameter)
    """
    total = len(Customer_data[z])  # denominator for the percentage labels
    plt.figure(figsize=(15, 5))
    ax = sns.countplot(Customer_data[z], palette='Paired')
    for p in ax.patches:
        # percentage of each class of the category
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)
        # Centre the label on its bar. The original computed
        # p.get_width() / total + 0.2, a typo that pinned every label near
        # the bar's left edge; half the bar width (as in the author's own
        # commented-out draft) is the intended horizontal offset.
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + p.get_height()  # label sits at the top of the bar
        ax.annotate(percentage, (x, y), size=10)
    plt.xticks(rotation=90)
    plt.show()  # show the plot
# Percentage countplots for every categorical variable, in the same order
# as the original one-call-per-line version.
categorical_columns = [
    'Gender',
    'Dependent_count',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
]
for column_name in categorical_columns:
    perc_on_bar(column_name)
# Correlation heatmap of all numeric columns, annotated with 2-dp coefficients.
plt.figure(figsize=(15, 7))
sns.heatmap(Customer_data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# Raw correlation matrix for reference alongside the heatmap.
Customer_data.corr()
| Attrition_Flag | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 1.000000 | 0.018203 | 0.018991 | 0.013687 | -0.150005 | 0.152449 | 0.204491 | -0.023873 | -0.263053 | -0.000285 | -0.131063 | -0.168598 | -0.371403 | -0.290054 | -0.178410 |
| Customer_Age | 0.018203 | 1.000000 | -0.122254 | 0.788912 | -0.010931 | 0.054361 | -0.018452 | 0.002476 | 0.014780 | 0.001151 | -0.062042 | -0.046446 | -0.067097 | -0.012143 | 0.007114 |
| Dependent_count | 0.018991 | -0.122254 | 1.000000 | -0.103062 | -0.039076 | -0.010768 | -0.040505 | 0.068065 | -0.002688 | 0.068291 | -0.035439 | 0.025046 | 0.049912 | 0.011087 | -0.037135 |
| Months_on_book | 0.013687 | 0.788912 | -0.103062 | 1.000000 | -0.009203 | 0.074164 | -0.010774 | 0.007507 | 0.008623 | 0.006732 | -0.048959 | -0.038591 | -0.049819 | -0.014072 | -0.007541 |
| Total_Relationship_Count | -0.150005 | -0.010931 | -0.039076 | -0.009203 | 1.000000 | -0.003675 | 0.055203 | -0.071386 | 0.013726 | -0.072601 | 0.050119 | -0.347229 | -0.241891 | 0.040831 | 0.067663 |
| Months_Inactive_12_mon | 0.152449 | 0.054361 | -0.010768 | 0.074164 | -0.003675 | 1.000000 | 0.029493 | -0.020394 | -0.042210 | -0.016605 | -0.032247 | -0.036982 | -0.042787 | -0.038989 | -0.007503 |
| Contacts_Count_12_mon | 0.204491 | -0.018452 | -0.040505 | -0.010774 | 0.055203 | 0.029493 | 1.000000 | 0.020817 | -0.053913 | 0.025646 | -0.024445 | -0.112774 | -0.152213 | -0.094997 | -0.055471 |
| Credit_Limit | -0.023873 | 0.002476 | 0.068065 | 0.007507 | -0.071386 | -0.020394 | 0.020817 | 1.000000 | 0.042493 | 0.995981 | 0.012813 | 0.171730 | 0.075927 | -0.002020 | -0.482965 |
| Total_Revolving_Bal | -0.263053 | 0.014780 | -0.002688 | 0.008623 | 0.013726 | -0.042210 | -0.053913 | 0.042493 | 1.000000 | -0.047167 | 0.058174 | 0.064370 | 0.056060 | 0.089861 | 0.624022 |
| Avg_Open_To_Buy | -0.000285 | 0.001151 | 0.068291 | 0.006732 | -0.072601 | -0.016605 | 0.025646 | 0.995981 | -0.047167 | 1.000000 | 0.007595 | 0.165923 | 0.070885 | -0.010076 | -0.538808 |
| Total_Amt_Chng_Q4_Q1 | -0.131063 | -0.062042 | -0.035439 | -0.048959 | 0.050119 | -0.032247 | -0.024445 | 0.012813 | 0.058174 | 0.007595 | 1.000000 | 0.039678 | 0.005469 | 0.384189 | 0.035235 |
| Total_Trans_Amt | -0.168598 | -0.046446 | 0.025046 | -0.038591 | -0.347229 | -0.036982 | -0.112774 | 0.171730 | 0.064370 | 0.165923 | 0.039678 | 1.000000 | 0.807192 | 0.085581 | -0.083034 |
| Total_Trans_Ct | -0.371403 | -0.067097 | 0.049912 | -0.049819 | -0.241891 | -0.042787 | -0.152213 | 0.075927 | 0.056060 | 0.070885 | 0.005469 | 0.807192 | 1.000000 | 0.112324 | 0.002838 |
| Total_Ct_Chng_Q4_Q1 | -0.290054 | -0.012143 | 0.011087 | -0.014072 | 0.040831 | -0.038989 | -0.094997 | -0.002020 | 0.089861 | -0.010076 | 0.384189 | 0.085581 | 0.112324 | 1.000000 | 0.074143 |
| Avg_Utilization_Ratio | -0.178410 | 0.007114 | -0.037135 | -0.007541 | 0.067663 | -0.007503 | -0.055471 | -0.482965 | 0.624022 | -0.538808 | 0.035235 | -0.083034 | 0.002838 | 0.074143 | 1.000000 |
# Pairwise scatter plots (KDE on the diagonal), coloured by the churn flag.
sns.pairplot(data=Customer_data, diag_kind="kde", hue= "Attrition_Flag")
plt.show()
# function to plot stacked bar chart for every variable against the dependent variable
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart.

    data: dataframe
    predictor: str, independent (categorical) variable
    target: str, target variable
    """
    count = data[predictor].nunique()
    # Least frequent target class; categories are sorted by its count.
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Row-normalised crosstab -> per-category class proportions for the bars.
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # Fix: the original called plt.legend twice; the first call (lower left,
    # frameless) was immediately overridden by the second, so it was dead code.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
stacked_barplot(Customer_data, "Customer_AgeRange", "Attrition_Flag")
Attrition_Flag 0 1 All Customer_AgeRange All 8500 1627 10127 (40, 50] 3873 779 4652 (50, 60] 2225 448 2673 (30, 40] 1822 310 2132 (60, 70] 346 58 404 (20, 30] 233 32 265 (70, 80] 1 0 1 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Gender", "Attrition_Flag")
Attrition_Flag 0 1 All Gender All 8500 1627 10127 F 4428 930 5358 M 4072 697 4769 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Dependent_count", "Attrition_Flag")
Attrition_Flag 0 1 All Dependent_count All 8500 1627 10127 3 2250 482 2732 2 2238 417 2655 1 1569 269 1838 4 1314 260 1574 0 769 135 904 5 360 64 424 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Education_Level", "Attrition_Flag")
Attrition_Flag 0 1 All Education_Level All 7237 1371 8608 Graduate 2641 487 3128 High School 1707 306 2013 Uneducated 1250 237 1487 College 859 154 1013 Doctorate 356 95 451 Post-Graduate 424 92 516 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Marital_Status", "Attrition_Flag")
Attrition_Flag 0 1 All Marital_Status All 7880 1498 9378 Married 3978 709 4687 Single 3275 668 3943 Divorced 627 121 748 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Income_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Income_Category All 8500 1627 10127 Less than $40K 2949 612 3561 $40K - $60K 1519 271 1790 $80K - $120K 1293 242 1535 $60K - $80K 1213 189 1402 abc 925 187 1112 $120K + 601 126 727 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Card_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Card_Category All 8500 1627 10127 Blue 7917 1519 9436 Silver 473 82 555 Gold 95 21 116 Platinum 15 5 20 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Months_on_bookRange", "Attrition_Flag")
Attrition_Flag 0 1 All Months_on_bookRange All 8500 1627 10127 (24, 36] 4547 871 5418 (36, 48] 2688 519 3207 (12, 24] 721 126 847 (48, 60] 544 111 655 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Total_Relationship_Count", "Attrition_Flag")
Attrition_Flag 0 1 All Total_Relationship_Count All 8500 1627 10127 3 1905 400 2305 2 897 346 1243 1 677 233 910 5 1664 227 1891 4 1687 225 1912 6 1670 196 1866 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Months_Inactive_12_mon", "Attrition_Flag")
Attrition_Flag 0 1 All Months_Inactive_12_mon All 8500 1627 10127 3 3020 826 3846 2 2777 505 3282 4 305 130 435 1 2133 100 2233 5 146 32 178 6 105 19 124 0 14 15 29 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Contacts_Count_12_mon", "Attrition_Flag")
Attrition_Flag 0 1 All Contacts_Count_12_mon All 8500 1627 10127 3 2699 681 3380 2 2824 403 3227 4 1077 315 1392 1 1391 108 1499 5 117 59 176 6 0 54 54 0 392 7 399 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Credit_LimitRange", "Attrition_Flag")
Attrition_Flag 0 1 All Credit_LimitRange All 8500 1627 10127 (1000, 5000] 4433 926 5359 (5000, 10000] 1712 302 2014 (10000, 15000] 796 145 941 (30000, 35000] 559 108 667 (15000, 20000] 479 70 549 (20000, 25000] 329 43 372 (25000, 30000] 192 33 225 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Total_Revolving_BalRange", "Attrition_Flag")
Attrition_Flag 0 1 All Total_Revolving_BalRange All 8500 1627 10127 (-1, 500] 1597 999 2596 (500, 1000] 1154 163 1317 (2500, 3000] 367 162 529 (1500, 2000] 2182 109 2291 (1000, 1500] 2156 102 2258 (2000, 2500] 1044 92 1136 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Avg_Open_To_BuyRange", "Attrition_Flag")
Attrition_Flag 0 1 All Avg_Open_To_BuyRange All 8500 1627 10127 (2, 5000] 5013 976 5989 (5000, 10000] 1369 273 1642 (10000, 15000] 686 134 820 (30000, 35000] 516 107 623 (15000, 20000] 445 64 509 (20000, 25000] 291 41 332 (25000, 30000] 180 32 212 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Total_Trans_AmtRange", "Attrition_Flag")
Attrition_Flag 0 1 All Total_Trans_AmtRange All 8500 1627 10127 (1000, 5000] 6850 1228 8078 (5000, 10000] 884 246 1130 (500, 1000] 19 143 162 (10000, 15000] 462 10 472 (15000, 20000] 285 0 285 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Total_Trans_CtRange", "Attrition_Flag")
Attrition_Flag 0 1 All Total_Trans_CtRange All 8500 1627 10127 (9, 50] 1941 1187 3128 (50, 100] 5910 440 6350 (100, 150] 649 0 649 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Total_Ct_Chng_Q4_Q1Range", "Attrition_Flag")
Attrition_Flag 0 1 All Total_Ct_Chng_Q4_Q1Range All 8500 1627 10127 (0.5, 1.0] 7098 830 7928 (-0.1, 0.5] 776 755 1531 (1.0, 1.5] 548 39 587 (2.0, 2.5] 15 2 17 (1.5, 2.0] 54 1 55 (2.5, 3.0] 5 0 5 (3.0, 3.5] 2 0 2 (3.5, 4.0] 2 0 2 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Total_Amt_Chng_Q4_Q1Range", "Attrition_Flag")
Attrition_Flag 0 1 All Total_Amt_Chng_Q4_Q1Range All 8500 1627 10127 (0.5, 1.0] 7215 1177 8392 (-0.1, 0.5] 434 308 742 (1.0, 1.5] 742 142 884 (1.5, 2.0] 89 0 89 (2.0, 2.5] 16 0 16 (2.5, 3.0] 2 0 2 (3.0, 3.5] 2 0 2 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(Customer_data, "Avg_Utilization_RatioRange", "Attrition_Flag")
Attrition_Flag 0 1 All Avg_Utilization_RatioRange All 8500 1627 10127 (-0.1, 0.1] 2984 1069 4053 (0.1, 0.2] 1172 121 1293 (0.2, 0.3] 819 88 907 (0.8, 0.9] 285 71 356 (0.3, 0.4] 588 70 658 (0.4, 0.5] 620 51 671 (0.5, 0.6] 696 47 743 (0.6, 0.7] 699 44 743 (0.7, 0.8] 550 42 592 (0.9, 1.0] 87 24 111 ------------------------------------------------------------------------------------------------------------------------
# Boxplots of every numeric column (target excluded) to eyeball outliers.
numerical_col = Customer_data.select_dtypes(include=np.number).columns.tolist()
numerical_col.remove('Attrition_Flag')
plt.figure(figsize=(20,30))
for i, variable in enumerate(numerical_col):
    plt.subplot(5,4,i+1)  # 5x4 grid of subplots, one per numeric column
    plt.boxplot(Customer_data[variable],whis=1.5)  # whiskers at 1.5 * IQR
    plt.tight_layout()
    plt.title(variable)
plt.show()
We will treat all of these outliers using the capping (IQR-whisker) method.
def treat_outliers(data, col):
    '''
    Cap outliers of one numerical column at its IQR whiskers (in place).

    data: data frame
    col: str, name of the numerical column
    Returns the same data frame, with data[col] clipped.
    '''
    q1 = data[col].quantile(0.25)   # 25th percentile
    q3 = data[col].quantile(0.75)   # 75th percentile
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # Values below the lower whisker are raised to it and values above the
    # upper whisker are lowered to it.
    data[col] = np.clip(data[col], lower_whisker, upper_whisker)
    return data
def treat_outliers_all(data, col_list):
    '''
    Apply treat_outliers (IQR-whisker capping) to every listed column.

    data: data frame
    col_list: list of numerical column names
    Returns the data frame with all listed columns capped.
    '''
    for column_name in col_list:
        data = treat_outliers(data, column_name)
    return data
# Cap outliers in all numeric columns (target excluded).
numerical_col = Customer_data.select_dtypes(include=np.number).columns.tolist()# getting list of numerical columns
numerical_col.remove('Attrition_Flag')
# NOTE(review): the name `Tour_data` looks like a leftover from another project.
# treat_outliers_all mutates Customer_data in place, so the later cells that use
# Customer_data still see the capped values — confirm and consider renaming.
Tour_data = treat_outliers_all(Customer_data,numerical_col)
# Re-plot the boxplots to verify the capping removed the extreme points.
numerical_col = Customer_data.select_dtypes(include=np.number).columns.tolist()
numerical_col.remove('Attrition_Flag')
plt.figure(figsize=(20,30))
for i, variable in enumerate(numerical_col):
    plt.subplot(5,4,i+1)
    plt.boxplot(Customer_data[variable],whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
# 'abc' is a placeholder level in Income_Category (1112 rows); convert it to NaN
# so it can be imputed with the other missing categoricals.
Customer_data['Income_Category'] = Customer_data['Income_Category'].replace('abc',np.nan)
Customer_data.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 Customer_AgeRange 0 Months_on_bookRange 0 Credit_LimitRange 0 Total_Revolving_BalRange 0 Avg_Open_To_BuyRange 0 Total_Trans_AmtRange 0 Total_Trans_CtRange 0 Total_Ct_Chng_Q4_Q1Range 0 Total_Amt_Chng_Q4_Q1Range 0 Avg_Utilization_RatioRange 0 dtype: int64
Customer_data['Income_Category'].unique()
['$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K', '$120K +', NaN] Categories (5, object): ['$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K', '$120K +']
Customer_data['Education_Level'].unique()
['High School', 'Graduate', 'Uneducated', NaN, 'College', 'Post-Graduate', 'Doctorate'] Categories (6, object): ['High School', 'Graduate', 'Uneducated', 'College', 'Post-Graduate', 'Doctorate']
Customer_data['Marital_Status'].unique()
['Married', 'Single', NaN, 'Divorced'] Categories (3, object): ['Married', 'Single', 'Divorced']
# KNN-based imputer for the categorical columns with missing values.
# NOTE(review): KNNImputer is not imported in the visible header — presumably
# `from sklearn.impute import KNNImputer` appears in an earlier cell; confirm.
imputer = KNNImputer(n_neighbors=5)
# defining a list with names of columns that will be used for imputation
col_for_impute = [
    "Income_Category",
    "Education_Level",
    "Marital_Status",
]
Customer_data[col_for_impute].head()
| Income_Category | Education_Level | Marital_Status | |
|---|---|---|---|
| 0 | $60K - $80K | High School | Married |
| 1 | Less than $40K | Graduate | Single |
| 2 | $80K - $120K | Graduate | Married |
| 3 | Less than $40K | High School | NaN |
| 4 | $60K - $80K | Uneducated | Married |
# Creating a new data copy for pipelines from original data
Customer_data1 = Customer_data.copy()
#Dropping unwanted range columns that were created for EDA purpose and one of the highly correlated column
# (Credit_Limit is dropped because it is ~0.996 correlated with Avg_Open_To_Buy).
Customer_data1.drop(['Customer_AgeRange','Months_on_bookRange','Credit_LimitRange','Total_Revolving_BalRange','Avg_Open_To_BuyRange','Total_Trans_AmtRange','Total_Trans_CtRange','Total_Ct_Chng_Q4_Q1Range','Total_Amt_Chng_Q4_Q1Range','Avg_Utilization_RatioRange','Credit_Limit'],axis=1,inplace=True)
# we need to pass numerical values for each categorical column for KNN imputation so we will label encode them
# (these dicts are also used later by inverse_mapping to decode back to labels;
# NaNs pass through .map unchanged and are filled by the KNN imputer).
Income_Category = {"$60K - $80K": 0, "Less than $40K": 1, "$80K - $120K": 2, "$40K - $60K": 3, "$120K +":4}
Customer_data1["Income_Category"] = Customer_data1["Income_Category"].map(Income_Category)
Education_Level = {"High School": 0, "Graduate": 1, "Uneducated": 2, "College": 3, "Post-Graduate":4, "Doctorate":5}
Customer_data1["Education_Level"] = Customer_data1["Education_Level"].map(Education_Level)
Marital_Status = {"Married": 0, "Single": 1, "Divorced": 2}
Customer_data1["Marital_Status"] = Customer_data1["Marital_Status"].map(Marital_Status)
Customer_data1.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45.0 | M | 3 | 0 | 0 | 0 | Blue | 39.0 | 5 | 1.0 | 3.0 | 777 | 11914.0 | 1.201 | 1144.0 | 42 | 1.172 | 0.061 |
| 1 | 0 | 49.0 | F | 5 | 1 | 1 | 1 | Blue | 44.0 | 6 | 1.0 | 2.0 | 864 | 7392.0 | 1.201 | 1291.0 | 33 | 1.172 | 0.105 |
| 2 | 0 | 51.0 | M | 3 | 1 | 0 | 2 | Blue | 36.0 | 4 | 1.0 | 0.5 | 0 | 3418.0 | 1.201 | 1887.0 | 20 | 1.172 | 0.000 |
| 3 | 0 | 40.0 | F | 4 | 0 | NaN | 1 | Blue | 34.0 | 3 | 4.0 | 1.0 | 2517 | 796.0 | 1.201 | 1171.0 | 20 | 1.172 | 0.760 |
| 4 | 0 | 40.0 | M | 3 | 2 | 0 | 0 | Blue | 21.0 | 5 | 1.0 | 0.5 | 0 | 4716.0 | 1.201 | 816.0 | 28 | 1.172 | 0.000 |
# Features and target: Attrition_Flag (1 = attrited) is the label.
X = Customer_data1.drop(["Attrition_Flag"],axis=1)
y = Customer_data1["Attrition_Flag"]
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
# (0.25 of the remaining 80% -> overall 60/20/20 split, stratified on the target)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 18) (2026, 18) (2026, 18)
# Report the resulting 60/20/20 row counts.
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in validation data =", X_val.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 6075 Number of rows in validation data = 2026 Number of rows in test data = 2026
# Fit the imputer on the train data only, then apply it to all three splits.
X_train[col_for_impute] = imputer.fit_transform(X_train[col_for_impute])
# Transform the validation data. Fix: the original called fit_transform here,
# which re-fitted the imputer on the validation set (data leakage) and left the
# imputer fitted on validation rather than train data for the test transform.
X_val[col_for_impute] = imputer.transform(X_val[col_for_impute])
# Transform the test data
X_test[col_for_impute] = imputer.transform(X_test[col_for_impute])
# Checking that no column has missing values in train, validation or test sets
# (all counts should be 0 after the KNN imputation above).
print(X_train.isna().sum())
print("-" * 30)
print(X_val.isna().sum())
print("-" * 30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
## Function to inverse the encoding
def inverse_mapping(x, y):
    """Decode column y of the global X_train/X_val/X_test back to its labels.

    x: dict, label -> integer code mapping that was used for encoding
    y: str, column name to decode
    Imputed codes are rounded to the nearest integer before decoding, and the
    decoded column is stored as a pandas 'category' dtype.
    """
    decoder = {code: label for label, code in x.items()}
    for frame in (X_train, X_val, X_test):
        frame[y] = np.round(frame[y]).map(decoder).astype("category")
# Decode the imputed (numeric) categorical columns back to their string labels.
inverse_mapping(Income_Category, "Income_Category")
inverse_mapping(Education_Level, "Education_Level")
inverse_mapping(Marital_Status, "Marital_Status")
# Sanity check: category counts in the train set after imputation and decoding.
cols = X_train.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_train[i].value_counts())
    print("*" * 30)
F 3193 M 2882 Name: Gender, dtype: int64 ****************************** Graduate 1854 High School 1228 Uneducated 881 College 618 Post-Graduate 312 Doctorate 254 Name: Education_Level, dtype: int64 ****************************** Married 2819 Single 2369 Divorced 430 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 2129 $40K - $60K 1059 $80K - $120K 953 $60K - $80K 831 $120K + 449 Name: Income_Category, dtype: int64 ****************************** Blue 5655 Silver 339 Gold 69 Platinum 12 Name: Card_Category, dtype: int64 ******************************
# Sanity check: category counts in the validation set after imputation/decoding.
cols = X_val.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_val[i].value_counts())
    print("*" * 30)
F 1095 M 931 Name: Gender, dtype: int64 ****************************** Graduate 623 High School 404 Uneducated 306 College 199 Post-Graduate 101 Doctorate 99 Name: Education_Level, dtype: int64 ****************************** Married 960 Single 770 Divorced 156 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 736 $40K - $60K 361 $80K - $120K 293 $60K - $80K 279 $120K + 136 Name: Income_Category, dtype: int64 ****************************** Blue 1905 Silver 97 Gold 21 Platinum 3 Name: Card_Category, dtype: int64 ******************************
# Sanity check: category counts in the test set after imputation/decoding.
cols = X_test.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_test[i].value_counts())
    print("*" * 30)
F 1070 M 956 Name: Gender, dtype: int64 ****************************** Graduate 651 High School 381 Uneducated 300 College 196 Post-Graduate 103 Doctorate 98 Name: Education_Level, dtype: int64 ****************************** Married 908 Single 804 Divorced 162 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 696 $40K - $60K 370 $60K - $80K 292 $80K - $120K 289 $120K + 142 Name: Income_Category, dtype: int64 ****************************** Blue 1876 Silver 119 Gold 26 Platinum 5 Name: Card_Category, dtype: int64 ******************************
# One-hot encode the categoricals; drop_first avoids the dummy-variable trap.
# NOTE(review): calling get_dummies on each split independently can misalign
# columns if a split is missing a category — the printed shapes match (28 each)
# here, but verify, or reindex val/test to the train columns to be safe.
X_train = pd.get_dummies(X_train, drop_first=True)
X_val = pd.get_dummies(X_val, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 28) (2026, 28) (2026, 28)
Predicting that a customer is going to leave the bank when the customer did not attrite / is not leaving the bank - loss of resources. Predicting that a customer is not going to leave the bank when the customer attrited / is leaving the bank - loss of opportunity.
Predicting a customer is not going to leave the bank but the customer attrited/ is leaving the bank i.e. losing on a potential customer as the customer will not be targeted by the bank for any offers/promotional calls.
The bank would want Recall to be maximized: the greater the Recall, the lower the chance of false negatives.
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted sklearn classifier.

    model: classifier
    predictors: independent variables
    target: dependent variable
    Returns a one-row DataFrame with columns Accuracy, Recall, Precision, F1.
    """
    # predicting using the independent variables
    pred = model.predict(predictors)
    # creating a dataframe of metrics (same column order as before)
    metrics = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(metrics, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the 2x2 confusion matrix of model on (predictors, target), with each
    cell annotated by its raw count and its percentage of all predictions.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    predicted = model.predict(predictors)
    cm = confusion_matrix(target, predicted)
    total = cm.flatten().sum()
    # Build "count\npercent" annotation strings, one per cell.
    annotations = np.asarray(
        [
            "{0:0.0f}".format(cell) + "\n{0:.2%}".format(cell / total)
            for cell in cm.flatten()
        ]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=annotations, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
models = []  # Empty list to store all the models
# Appending models into the list
# NOTE(review): the classifier classes and StratifiedKFold/cross_val_score are
# presumably imported in an earlier cell (sklearn.ensemble, xgboost, ...) — confirm.
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="error")))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))
results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    scoring = "recall"  # recall: false negatives (missed churners) are the costly error
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
# Fit on the full train set so train recall can be compared with CV recall (overfit check).
print("\n" "Training Performance:" "\n")
for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 78.68602825745683 Random forest: 74.69126111983255 GBM: 82.16954474097331 Adaboost: 81.45107273678703 Xgboost: 85.95970695970696 dtree: 77.45892203035059 Training Performance: Bagging: 98.36065573770492 Random forest: 100.0 GBM: 88.11475409836066 Adaboost: 83.09426229508196 Xgboost: 100.0 dtree: 100.0
# Plotting boxplots for CV scores of all models defined above
# (shows the spread of recall across the 5 folds, not just the mean).
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Baseline Bagging classifier fitted on the (un-oversampled) train set.
bag= BaggingClassifier(random_state=1)
bag.fit(X_train, y_train)
BaggingClassifier(random_state=1)
# Calculating different metrics on train set
bagging_model_train_perf = model_performance_classification_sklearn(
    bag, X_train, y_train
)
print("Training performance:")
bagging_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.996872 | 0.983607 | 0.996885 | 0.990201 |
# Calculating different metrics on validation set
bagging_model_val_perf = model_performance_classification_sklearn(bag, X_val, y_val)
print("Validation performance:")
bagging_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.95311 | 0.809816 | 0.888889 | 0.847512 |
# creating confusion matrix for the bagging model on validation data
confusion_matrix_sklearn(bag, X_val, y_val)
# Random forest baseline.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
RandomForestClassifier(random_state=1)
# Calculating different metrics on train set
random_forest_model_train_perf = model_performance_classification_sklearn(
    rf, X_train, y_train
)
print("Training performance:")
random_forest_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
random_forest_model_val_perf = model_performance_classification_sklearn(rf, X_val, y_val)
print("Validation performance:")
random_forest_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.959033 | 0.809816 | 0.926316 | 0.864157 |
# creating confusion matrix for the random forest on validation data
confusion_matrix_sklearn(rf, X_val, y_val)
# Gradient boosting baseline.
gbm = GradientBoostingClassifier(random_state=1)
gbm.fit(X_train, y_train)
GradientBoostingClassifier(random_state=1)
# Calculating different metrics on train set
gradient_boost_model_train_perf = model_performance_classification_sklearn(
    gbm, X_train, y_train
)
print("Training performance:")
gradient_boost_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.973169 | 0.881148 | 0.948181 | 0.913436 |
# Calculating different metrics on validation set
gradient_boost_model_val_perf = model_performance_classification_sklearn(
    gbm, X_val, y_val
)
print("Validation performance:")
gradient_boost_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.968411 | 0.861963 | 0.936667 | 0.897764 |
# creating confusion matrix for the GBM on validation data
confusion_matrix_sklearn(gbm, X_val, y_val)
# AdaBoost baseline.
abm = AdaBoostClassifier(random_state=1)
abm.fit(X_train, y_train)
AdaBoostClassifier(random_state=1)
# Calculating different metrics on train set
ada_boost_model_train_perf = model_performance_classification_sklearn(
    abm, X_train, y_train
)
print("Training performance:")
ada_boost_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.956049 | 0.830943 | 0.88828 | 0.858655 |
# Calculating different metrics on validation set
ada_boost_model_val_perf = model_performance_classification_sklearn(
    abm, X_val, y_val
)
print("Validation performance:")
ada_boost_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.959526 | 0.852761 | 0.891026 | 0.871473 |
# creating confusion matrix for AdaBoost on validation data
confusion_matrix_sklearn(abm, X_val, y_val)
# XGBoost baseline; eval_metric="error" sets the evaluation metric explicitly.
xgb = XGBClassifier(random_state=1, eval_metric="error")
xgb.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='error',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xg_boost_model_train_perf = model_performance_classification_sklearn(
    xgb, X_train, y_train
)
print("Training performance:")
xg_boost_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
xg_boost_model_val_perf = model_performance_classification_sklearn(
    xgb, X_val, y_val
)
print("Validation performance:")
xg_boost_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.968411 | 0.871166 | 0.928105 | 0.898734 |
# creating confusion matrix for XGBoost on validation data
confusion_matrix_sklearn(xgb, X_val, y_val)
# Single decision tree baseline.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)
# Calculating different metrics on train set
decision_tree_model_train_perf = model_performance_classification_sklearn(
    dt, X_train, y_train
)
print("Training performance:")
decision_tree_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
decision_tree_model_val_perf = model_performance_classification_sklearn(
    dt, X_val, y_val
)
print("Validation performance:")
decision_tree_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.935341 | 0.809816 | 0.792793 | 0.801214 |
# creating confusion matrix
confusion_matrix_sklearn(dt, X_val, y_val)
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# Synthetic Minority Over-sampling Technique: balance the classes 1:1
# by synthesizing new minority (churn) samples from 5 nearest neighbours.
# NOTE: named `smote` instead of `sm` so it does not shadow the
# `statsmodels.api as sm` import at the top of the file.
smote = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
models = []  # (name, estimator) pairs to evaluate on the oversampled data
# Appending models into the list
models.append(("Bagging Oversampled", BaggingClassifier(random_state=1)))
models.append(("Random forest Oversampled", RandomForestClassifier(random_state=1)))
models.append(("GBM Oversampled", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost Oversampled", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost Oversampled", XGBClassifier(random_state=1, eval_metric="error")))
models.append(("dtree Oversampled", DecisionTreeClassifier(random_state=1)))
results_over = []  # Empty list to store all model's CV scores
names_over = []  # Empty list to store name of the models
# The scorer and the CV splitter are loop-invariant, so define them once
# instead of rebuilding them on every iteration. The splits are identical
# either way because the splitter is seeded with a fixed random_state.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # 5 splits
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    cv_result_over = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results_over.append(cv_result_over)
    names_over.append(name)
    print("{}: {}".format(name, cv_result_over.mean() * 100))
# Refit each model on the full oversampled train set and report train recall
print("\n" "Training Performance:" "\n")
for name, model in models:
    model.fit(X_train_over, y_train_over)
    scores = recall_score(y_train_over, model.predict(X_train_over)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging Oversampled: 96.0777001674075 Random forest Oversampled: 97.66630106409589 GBM Oversampled: 97.48977274913891 Adaboost Oversampled: 96.60730435451903 Xgboost Oversampled: 98.33302545748428 dtree Oversampled: 94.92072196886605 Training Performance: Bagging Oversampled: 99.68621298293783 Random forest Oversampled: 100.0 GBM Oversampled: 98.07805452049422 Adaboost Oversampled: 96.96018827221023 Xgboost Oversampled: 100.0 dtree Oversampled: 100.0
# Plotting boxplots for CV scores of all models defined above.
# Use the Axes API consistently instead of mixing `plt.*` and `ax.*` calls
# on the same figure.
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
ax.boxplot(results_over)
ax.set_xticklabels(names_over, rotation=90)
plt.show()
The top three models in oversampled data giving the best recalls are XG Boost oversampled, Random Forest oversampled and GBM oversampled.
bag_over = BaggingClassifier(random_state=1)
# Training the over sampled logistic regression model with training set
bag_over.fit(X_train_over, y_train_over)
BaggingClassifier(random_state=1)
# Calculating different metrics on train set
bagging_over_train_perf = model_performance_classification_sklearn(
bag_over, X_train_over, y_train_over
)
print("Training performance:")
bagging_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.997647 | 0.996862 | 0.998429 | 0.997645 |
# Calculating different metrics on validation set
bagging_over_val_perf = model_performance_classification_sklearn(
bag_over, X_val, y_val
)
print("validation performance:")
bagging_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.943238 | 0.843558 | 0.811209 | 0.827068 |
# creating confusion matrix
confusion_matrix_sklearn(bag_over, X_val, y_val)
rf_over = RandomForestClassifier(random_state=1)
# Training the over sampled random forest model with training set
rf_over.fit(X_train_over, y_train_over)
RandomForestClassifier(random_state=1)
# Calculating different metrics on train set
random_forest_over_train_perf = model_performance_classification_sklearn(
rf_over, X_train_over, y_train_over
)
print("Training performance:")
random_forest_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
random_forest_over_val_perf = model_performance_classification_sklearn(
rf_over, X_val, y_val
)
print("validation performance:")
random_forest_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.953603 | 0.843558 | 0.86478 | 0.854037 |
# creating confusion matrix
confusion_matrix_sklearn(rf_over, X_val, y_val)
gbm_over = GradientBoostingClassifier(random_state=1)
# Training the over sampled random forest model with training set
gbm_over.fit(X_train_over, y_train_over)
GradientBoostingClassifier(random_state=1)
# Calculating different metrics on train set
gradient_boost_over_train_perf = model_performance_classification_sklearn(
gbm_over, X_train_over, y_train_over
)
print("Training performance:")
gradient_boost_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.980388 | 0.980781 | 0.980012 | 0.980396 |
# Calculating different metrics on validation set
gradient_boost_over_val_perf = model_performance_classification_sklearn(
gbm_over, X_val, y_val
)
print("validation performance:")
gradient_boost_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.962981 | 0.886503 | 0.883792 | 0.885145 |
# creating confusion matrix
confusion_matrix_sklearn(gbm_over, X_val, y_val)
abm_over = AdaBoostClassifier(random_state=1)
# Training the over sampled random forest model with training set
abm_over.fit(X_train_over, y_train_over)
AdaBoostClassifier(random_state=1)
# Calculating different metrics on train set
ada_boost_over_train_perf = model_performance_classification_sklearn(
abm_over, X_train_over, y_train_over
)
print("Training performance:")
ada_boost_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.967641 | 0.969602 | 0.965814 | 0.967704 |
# Calculating different metrics on validation set
ada_boost_over_val_perf = model_performance_classification_sklearn(
abm_over, X_val, y_val
)
print("validation performance:")
ada_boost_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.948667 | 0.880368 | 0.815341 | 0.846608 |
# creating confusion matrix
confusion_matrix_sklearn(abm_over, X_val, y_val)
xgb_over = XGBClassifier(random_state=1, eval_metric="error")
# Training the over sampled random forest model with training set
xgb_over.fit(X_train_over, y_train_over)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='error',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=4,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xg_boost_over_train_perf = model_performance_classification_sklearn(
xgb_over, X_train_over, y_train_over
)
print("Training performance:")
xg_boost_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
xg_boost_over_val_perf = model_performance_classification_sklearn(
xgb_over, X_val, y_val
)
print("validation performance:")
xg_boost_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.968411 | 0.895706 | 0.906832 | 0.901235 |
# creating confusion matrix
confusion_matrix_sklearn(xgb_over, X_val, y_val)
dt_over = DecisionTreeClassifier(random_state=1)
# Training the over sampled random forest model with training set
dt_over.fit(X_train_over, y_train_over)
DecisionTreeClassifier(random_state=1)
# Calculating different metrics on train set
decision_tree_over_train_perf = model_performance_classification_sklearn(
dt_over, X_train_over, y_train_over
)
print("Training performance:")
decision_tree_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
decision_tree_over_val_perf = model_performance_classification_sklearn(
dt_over, X_val, y_val
)
print("validation performance:")
decision_tree_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.923495 | 0.788344 | 0.749271 | 0.768311 |
# creating confusion matrix
confusion_matrix_sklearn(dt_over, X_val, y_val)
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
models = []  # (name, estimator) pairs to evaluate on the undersampled data
# Appending models into the list
models.append(("Bagging Undersampled", BaggingClassifier(random_state=1)))
models.append(("Random forest Undersampled", RandomForestClassifier(random_state=1)))
models.append(("GBM Undersampled", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost Undersampled", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost Undersampled", XGBClassifier(random_state=1, eval_metric="error")))
models.append(("dtree Undersampled", DecisionTreeClassifier(random_state=1)))
results_under = []  # Empty list to store all model's CV scores
names_under = []  # Empty list to store name of the models
# The scorer and the CV splitter are loop-invariant, so define them once
# instead of rebuilding them on every iteration. The splits are identical
# either way because the splitter is seeded with a fixed random_state.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # 5 splits
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    cv_result_under = cross_val_score(
        estimator=model, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
    )
    results_under.append(cv_result_under)
    names_under.append(name)
    print("{}: {}".format(name, cv_result_under.mean() * 100))
# Refit each model on the full undersampled train set and report train recall
print("\n" "Training Performance:" "\n")
for name, model in models:
    model.fit(X_train_un, y_train_un)
    scores = recall_score(y_train_un, model.predict(X_train_un)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging Undersampled: 91.18942961800104 Random forest Undersampled: 93.64835164835166 GBM Undersampled: 94.05756148613293 Adaboost Undersampled: 92.82993197278913 Xgboost Undersampled: 95.28833071690215 dtree Undersampled: 88.21664050235478 Training Performance: Bagging Undersampled: 99.48770491803278 Random forest Undersampled: 100.0 GBM Undersampled: 98.15573770491804 Adaboost Undersampled: 94.87704918032787 Xgboost Undersampled: 100.0 dtree Undersampled: 100.0
# Plotting boxplots for CV scores of all models defined above.
# Use the Axes API consistently instead of mixing `plt.*` and `ax.*` calls
# on the same figure.
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
ax.boxplot(results_under)
ax.set_xticklabels(names_under, rotation=90)
plt.show()
bag_under = BaggingClassifier(random_state=1)
bag_under.fit(X_train_un, y_train_un)
BaggingClassifier(random_state=1)
# Calculating different metrics on train set
bagging_under_train_perf = model_performance_classification_sklearn(
bag_under, X_train_un, y_train_un
)
print("Training performance:")
bagging_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.996926 | 0.994877 | 0.998971 | 0.99692 |
# Calculating different metrics on validation set
bagging_under_val_perf = model_performance_classification_sklearn(
bag_under, X_val, y_val
)
print("Validation performance:")
bagging_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.926456 | 0.932515 | 0.705336 | 0.80317 |
# creating confusion matrix
confusion_matrix_sklearn(bag_under, X_val, y_val)
rf_under = RandomForestClassifier(random_state=1)
rf_under.fit(X_train_un, y_train_un)
RandomForestClassifier(random_state=1)
# Calculating different metrics on train set
random_forest_under_train_perf = model_performance_classification_sklearn(
rf_under, X_train_un, y_train_un
)
print("Training performance:")
random_forest_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
random_forest_under_val_perf = model_performance_classification_sklearn(
rf_under, X_val, y_val
)
print("Validation performance:")
random_forest_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.933366 | 0.929448 | 0.73012 | 0.817814 |
# creating confusion matrix
confusion_matrix_sklearn(rf_under, X_val, y_val)
gbm_under = GradientBoostingClassifier(random_state=1)
gbm_under.fit(X_train_un, y_train_un)
GradientBoostingClassifier(random_state=1)
# Calculating different metrics on train set
gradient_boost_under_train_perf = model_performance_classification_sklearn(
gbm_under, X_train_un, y_train_un
)
print("Training performance:")
gradient_boost_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.97541 | 0.981557 | 0.969636 | 0.97556 |
# Calculating different metrics on validation set
gradient_boost_under_val_perf = model_performance_classification_sklearn(
gbm_under, X_val, y_val
)
print("Validation performance:")
gradient_boost_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.936328 | 0.96319 | 0.728538 | 0.82959 |
# creating confusion matrix
confusion_matrix_sklearn(gbm_under, X_val, y_val)
abm_under = AdaBoostClassifier(random_state=1)
# Training the over sampled random forest model with training set
abm_under.fit(X_train_un, y_train_un)
AdaBoostClassifier(random_state=1)
# Calculating different metrics on train set
ada_boost_under_train_perf = model_performance_classification_sklearn(
abm_under, X_train_un, y_train_un
)
print("Training performance:")
ada_boost_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.943648 | 0.94877 | 0.939148 | 0.943935 |
# Calculating different metrics on validation set
ada_boost_under_val_perf = model_performance_classification_sklearn(
abm_under, X_val, y_val
)
print("validation performance:")
ada_boost_under_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.932379 | 0.960123 | 0.716247 | 0.820446 |
# creating confusion matrix
confusion_matrix_sklearn(abm_under, X_val, y_val)
xgb_under = XGBClassifier(random_state=1, eval_metric="error")
# Training the over sampled random forest model with training set
xgb_under.fit(X_train_un, y_train_un)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='error',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=4,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set
xg_boost_under_train_perf = model_performance_classification_sklearn(
xgb_under, X_train_un, y_train_un
)
print("Training performance:")
xg_boost_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
xg_boost_under_val_perf = model_performance_classification_sklearn(
xgb_under, X_val, y_val
)
print("validation performance:")
xg_boost_under_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.939289 | 0.957055 | 0.741093 | 0.835341 |
# creating confusion matrix
confusion_matrix_sklearn(xgb_under, X_val, y_val)
dt_under = DecisionTreeClassifier(random_state=1)
# Training the over sampled random forest model with training set
dt_under.fit(X_train_un, y_train_un)
DecisionTreeClassifier(random_state=1)
# Calculating different metrics on train set
decision_tree_under_train_perf = model_performance_classification_sklearn(
dt_under, X_train_un, y_train_un
)
print("Training performance:")
decision_tree_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
decision_tree_under_val_perf = model_performance_classification_sklearn(
dt_under, X_val, y_val
)
print("validation performance:")
decision_tree_under_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.895854 | 0.907975 | 0.620545 | 0.737235 |
# creating confusion matrix
confusion_matrix_sklearn(dt_under, X_val, y_val)
## Function to calculate different metric scores of the raw model - Accuracy, Recall, Precision and F1 Score
def get_metrics_score(model, flag=True):
    '''
    Compute accuracy, recall, precision and F1 for `model` on the raw
    (unresampled) train split and the validation split.

    model : fitted classifier exposing .predict and .score
    flag  : if True (default), also print each metric

    Returns
    -------
    list : [train_acc, val_acc, train_recall, val_recall,
            train_precision, val_precision, train_f1, val_f1]
    '''
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    # Compute each metric exactly once and reuse the values below, instead
    # of re-predicting/re-scoring inside the print statements as before.
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    train_recall = metrics.recall_score(y_train, pred_train)
    val_recall = metrics.recall_score(y_val, pred_val)
    train_precision = metrics.precision_score(y_train, pred_train)
    val_precision = metrics.precision_score(y_val, pred_val)
    train_f1_score = metrics.f1_score(y_train, pred_train)
    val_f1_score = metrics.f1_score(y_val, pred_val)
    score_list = [
        train_acc, val_acc, train_recall, val_recall,
        train_precision, val_precision, train_f1_score, val_f1_score,
    ]
    # If the flag is set to True then only the following print statements will be displayed.
    if flag:
        # Labels say "validation set": the second split here is X_val, not a test set.
        print("Accuracy on training set : ", train_acc)
        print("Accuracy on validation set : ", val_acc)
        print("Recall on training set : ", train_recall)
        print("Recall on validation set : ", val_recall)
        print("Precision on training set : ", train_precision)
        print("Precision on validation set : ", val_precision)
        print("F1 score on training set : ", train_f1_score)
        print("F1 score on validation set : ", val_f1_score)
    return score_list  # returning the list with train and validation scores
## Function to calculate different metric scores of the oversampled model - Accuracy, Recall, Precision and F1 Score
def get_metrics_score_over(model, flag=True):
    '''
    Compute accuracy, recall, precision and F1 for `model` on the
    SMOTE-oversampled train split and the validation split.

    model : fitted classifier exposing .predict and .score
    flag  : if True (default), also print each metric

    Returns
    -------
    list : [train_acc, val_acc, train_recall, val_recall,
            train_precision, val_precision, train_f1, val_f1]
    '''
    pred_train_over = model.predict(X_train_over)
    pred_val = model.predict(X_val)
    # Compute each metric exactly once and reuse the values below, instead
    # of re-predicting/re-scoring inside the print statements as before.
    train_acc = model.score(X_train_over, y_train_over)
    val_acc = model.score(X_val, y_val)
    train_recall = metrics.recall_score(y_train_over, pred_train_over)
    val_recall = metrics.recall_score(y_val, pred_val)
    train_precision = metrics.precision_score(y_train_over, pred_train_over)
    val_precision = metrics.precision_score(y_val, pred_val)
    train_f1_score = metrics.f1_score(y_train_over, pred_train_over)
    val_f1_score = metrics.f1_score(y_val, pred_val)
    score_list_over = [
        train_acc, val_acc, train_recall, val_recall,
        train_precision, val_precision, train_f1_score, val_f1_score,
    ]
    # If the flag is set to True then only the following print statements will be displayed.
    if flag:
        # Labels say "validation set": the second split here is X_val, not a test set.
        print("Accuracy on training set : ", train_acc)
        print("Accuracy on validation set : ", val_acc)
        print("Recall on training set : ", train_recall)
        print("Recall on validation set : ", val_recall)
        print("Precision on training set : ", train_precision)
        print("Precision on validation set : ", val_precision)
        print("F1 score on training set : ", train_f1_score)
        print("F1 score on validation set : ", val_f1_score)
    return score_list_over  # returning the list with train and validation scores
## Function to calculate different metric scores of the undersampled model - Accuracy, Recall, Precision and F1 Score
def get_metrics_score_under(model, flag=True):
    '''
    Compute accuracy, recall, precision and F1 for `model` on the
    randomly undersampled train split and the validation split.

    model : fitted classifier exposing .predict and .score
    flag  : if True (default), also print each metric

    Returns
    -------
    list : [train_acc, val_acc, train_recall, val_recall,
            train_precision, val_precision, train_f1, val_f1]
    '''
    pred_train_under = model.predict(X_train_un)
    pred_val = model.predict(X_val)
    # Compute each metric exactly once and reuse the values below, instead
    # of re-predicting/re-scoring inside the print statements as before.
    train_acc = model.score(X_train_un, y_train_un)
    val_acc = model.score(X_val, y_val)
    train_recall = metrics.recall_score(y_train_un, pred_train_under)
    val_recall = metrics.recall_score(y_val, pred_val)
    train_precision = metrics.precision_score(y_train_un, pred_train_under)
    val_precision = metrics.precision_score(y_val, pred_val)
    train_f1_score = metrics.f1_score(y_train_un, pred_train_under)
    val_f1_score = metrics.f1_score(y_val, pred_val)
    score_list_under = [
        train_acc, val_acc, train_recall, val_recall,
        train_precision, val_precision, train_f1_score, val_f1_score,
    ]
    # If the flag is set to True then only the following print statements will be displayed.
    if flag:
        print("Accuracy on training set : ", train_acc)
        print("Accuracy on val set : ", val_acc)
        print("Recall on training set : ", train_recall)
        print("Recall on val set : ", val_recall)
        print("Precision on training set : ", train_precision)
        print("Precision on val set : ", val_precision)
        print("F1 score on training set : ", train_f1_score)
        print("F1 score on val set : ", val_f1_score)
    return score_list_under  # returning the list with train and validation scores
# Model lists for the three pipelines compared below.
models = [bag, rf, gbm, abm, xgb, dt]                                   # raw data
models1 = [bag_over, rf_over, gbm_over, abm_over, xgb_over, dt_over]    # oversampled
models2 = [bag_under, rf_under, gbm_under, abm_under, xgb_under, dt_under]  # undersampled
# Accumulators shared by all three comparison loops that follow.
# Defined once — the original re-initialised the same eight lists three
# times, which was redundant (nothing appends to them in between).
acc_train = []
acc_val = []
recall_train = []
recall_val = []
precision_train = []
precision_val = []
f1_score_train = []
f1_score_val = []
# looping through all the models to get the accuracy, precall and precision scores
# Raw models
for model in models:
j = get_metrics_score(model,False)
acc_train.append(np.round(j[0],2))
acc_val.append(np.round(j[1],2))
recall_train.append(np.round(j[2],2))
recall_val.append(np.round(j[3],2))
precision_train.append(np.round(j[4],2))
precision_val.append(np.round(j[5],2))
f1_score_train.append(np.round(j[6],2))
f1_score_val.append(np.round(j[7],2))
# Oversampled models
for model in models1:
j = get_metrics_score_over(model,False)
acc_train.append(np.round(j[0],2))
acc_val.append(np.round(j[1],2))
recall_train.append(np.round(j[2],2))
recall_val.append(np.round(j[3],2))
precision_train.append(np.round(j[4],2))
precision_val.append(np.round(j[5],2))
f1_score_train.append(np.round(j[6],2))
f1_score_val.append(np.round(j[7],2))
# Undersampled models
for model in models2:
j = get_metrics_score_under(model,False)
acc_train.append(np.round(j[0],2))
acc_val.append(np.round(j[1],2))
recall_train.append(np.round(j[2],2))
recall_val.append(np.round(j[3],2))
precision_train.append(np.round(j[4],2))
precision_val.append(np.round(j[5],2))
f1_score_train.append(np.round(j[6],2))
f1_score_val.append(np.round(j[7],2))
comparison_frame = pd.DataFrame({'Model':['Bagging','Random Forest','Gradient Boost','Ada Boost','XG Boost','Decision Tree','Bagging Oversampled','Random Forest Oversampled','Gradient Boost Oversampled','Ada Boost Oversampled','XG Boost Oversampled','Decision Tree Oversampled','Bagging Undersampled','Random Forest Undersampled','Gradient Boost Undersampled','Ada Boost Undersampled','XG Boost Undersampled','Decision Tree Undersampled'],
'Train_Accuracy': acc_train,'Val_Accuracy': acc_val,
'Train_Recall':recall_train,'Val_Recall':recall_val,
'Train_Precision':precision_train,'Val_Precision':precision_val,
'Train_f1score':f1_score_train,'Val_f1score':f1_score_val})
comparison_frame
| Model | Train_Accuracy | Val_Accuracy | Train_Recall | Val_Recall | Train_Precision | Val_Precision | Train_f1score | Val_f1score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Bagging | 1.00 | 0.95 | 0.98 | 0.81 | 1.00 | 0.89 | 0.99 | 0.85 |
| 1 | Random Forest | 1.00 | 0.96 | 1.00 | 0.81 | 1.00 | 0.93 | 1.00 | 0.86 |
| 2 | Gradient Boost | 0.97 | 0.97 | 0.88 | 0.86 | 0.95 | 0.94 | 0.91 | 0.90 |
| 3 | Ada Boost | 0.96 | 0.96 | 0.83 | 0.85 | 0.89 | 0.89 | 0.86 | 0.87 |
| 4 | XG Boost | 1.00 | 0.97 | 1.00 | 0.87 | 1.00 | 0.93 | 1.00 | 0.90 |
| 5 | Decision Tree | 1.00 | 0.94 | 1.00 | 0.81 | 1.00 | 0.79 | 1.00 | 0.80 |
| 6 | Bagging Oversampled | 1.00 | 0.94 | 1.00 | 0.84 | 1.00 | 0.81 | 1.00 | 0.83 |
| 7 | Random Forest Oversampled | 1.00 | 0.95 | 1.00 | 0.84 | 1.00 | 0.86 | 1.00 | 0.85 |
| 8 | Gradient Boost Oversampled | 0.98 | 0.96 | 0.98 | 0.89 | 0.98 | 0.88 | 0.98 | 0.89 |
| 9 | Ada Boost Oversampled | 0.97 | 0.95 | 0.97 | 0.88 | 0.97 | 0.82 | 0.97 | 0.85 |
| 10 | XG Boost Oversampled | 1.00 | 0.97 | 1.00 | 0.90 | 1.00 | 0.91 | 1.00 | 0.90 |
| 11 | Decision Tree Oversampled | 1.00 | 0.92 | 1.00 | 0.79 | 1.00 | 0.75 | 1.00 | 0.77 |
| 12 | Bagging Undersampled | 1.00 | 0.93 | 0.99 | 0.93 | 1.00 | 0.71 | 1.00 | 0.80 |
| 13 | Random Forest Undersampled | 1.00 | 0.93 | 1.00 | 0.93 | 1.00 | 0.73 | 1.00 | 0.82 |
| 14 | Gradient Boost Undersampled | 0.98 | 0.94 | 0.98 | 0.96 | 0.97 | 0.73 | 0.98 | 0.83 |
| 15 | Ada Boost Undersampled | 0.94 | 0.93 | 0.95 | 0.96 | 0.94 | 0.72 | 0.94 | 0.82 |
| 16 | XG Boost Undersampled | 1.00 | 0.94 | 1.00 | 0.96 | 1.00 | 0.74 | 1.00 | 0.84 |
| 17 | Decision Tree Undersampled | 1.00 | 0.90 | 1.00 | 0.91 | 1.00 | 0.62 | 1.00 | 0.74 |
The top three models giving the best recall score in validation are Gradient Boost Undersampled, Ada Boost Undersampled and XG Boost Undersampled (validation recall ≈ 0.96 each).
%%time
# defining model
model = AdaBoostClassifier(random_state=1)
# Parameter grid to pass in RandomSearchCV
param_random_abm = {
"n_estimators": np.arange(5,225,5),
"learning_rate":[0.1,0.2,0.5,1,0.05]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv_abm = RandomizedSearchCV(estimator=model, param_distributions=param_random_abm, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv_abm.fit(X_train_un,y_train_un)
print("Best parameters are {} with CV score={}:" .format(randomized_cv_abm.best_params_,randomized_cv_abm.best_score_))
Best parameters are {'n_estimators': 175, 'learning_rate': 0.2} with CV score=0.9395656724228154:
Wall time: 37.5 s
# Building the final model with the best parameters actually found by
# RandomizedSearchCV (n_estimators=175, learning_rate=0.2). The original
# hard-coded n_estimators=205, learning_rate=1, which did not match the
# search result despite the "best parameters" comment.
abm_under_tuned = AdaBoostClassifier(
    n_estimators=randomized_cv_abm.best_params_["n_estimators"],
    learning_rate=randomized_cv_abm.best_params_["learning_rate"],
    random_state=1,
)
# Fit the model on the undersampled training data
abm_under_tuned.fit(X_train_un, y_train_un)
AdaBoostClassifier(learning_rate=1, n_estimators=205, random_state=1)
# Calculating different metrics on train set
AdaBoost_random_train = model_performance_classification_sklearn(
abm_under_tuned, X_train_un, y_train_un
)
print("Training performance:")
AdaBoost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.96875 | 0.976434 | 0.961655 | 0.968988 |
# Calculating different metrics on validation set
AdaBoost_random_val = model_performance_classification_sklearn(abm_under_tuned, X_val, y_val)
print("Validation performance:")
AdaBoost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.929911 | 0.947853 | 0.711982 | 0.813158 |
# creating confusion matrix
confusion_matrix_sklearn(abm_under_tuned, X_val, y_val)
%%time
# defining model
model = XGBClassifier(random_state=1,eval_metric='error')
# Parameter grid to pass in RandomizedSearchCV
param_random={'n_estimators':np.arange(10,100,5),
'learning_rate':[0.5,0.05,0.01,0.1,1],
'gamma':[1,3,5,7,9],
'subsample':[0.5,0.6],
'max_depth':[1,2,3],
'reg_lambda':[5,10,15,20],
'colsample_bytree':[0.5,0.7,0.8],
'colsample_bylevel':[0.5,0.7,0.8]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
xgb_under_tuned = RandomizedSearchCV(estimator=model, param_distributions=param_random, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
xgb_under_tuned.fit(X_train_un,y_train_un)
print("Best parameters are {} with CV score={}:" .format(xgb_under_tuned.best_params_,xgb_under_tuned.best_score_))
Best parameters are {'subsample': 0.6, 'reg_lambda': 20, 'n_estimators': 95, 'max_depth': 3, 'learning_rate': 0.5, 'gamma': 1, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.5} with CV score=0.9508006279434852:
Wall time: 8.81 s
# Rebuild the final XGBoost model with the best parameters found by the
# randomized search above.
xgb_under_tuned = XGBClassifier(
    random_state=1,
    n_estimators=95,
    learning_rate=0.5,
    max_depth=3,
    gamma=1,
    reg_lambda=20,
    subsample=0.6,
    colsample_bytree=0.5,
    colsample_bylevel=0.5,
    eval_metric="error",
)
# Fit on the undersampled training data
xgb_under_tuned.fit(X_train_un, y_train_un)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
colsample_bynode=1, colsample_bytree=0.5, eval_metric='error',
gamma=1, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.5, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=95, n_jobs=4,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=20,
scale_pos_weight=1, subsample=0.6, tree_method='exact',
validate_parameters=1, verbosity=None)
# Scores of the tuned XGBoost model on the undersampled training set
xgboost_random_train = model_performance_classification_sklearn(
    xgb_under_tuned,
    X_train_un,
    y_train_un,
)
print("XG Boost Tuned Training performance:")
xgboost_random_train
XG Boost Tuned Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.971311 | 0.976434 | 0.966531 | 0.971458 |
# Scores of the tuned XGBoost model on the validation set
xgboost_random_val = model_performance_classification_sklearn(
    xgb_under_tuned, X_val, y_val
)
print("XG Boost Tuned Validation performance:")
xgboost_random_val
XG Boost Tuned Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.939289 | 0.95092 | 0.743405 | 0.834455 |
# Confusion matrix for the tuned XGBoost model on the validation set
confusion_matrix_sklearn(xgb_under_tuned, X_val, y_val)
%%time
# defining model
model = GradientBoostingClassifier(init = 'zero',random_state=1)
# Parameter grid to pass in GridSearchCV
param_random_gbm = {'n_estimators': np.arange(10,110,5),
'subsample':[0.7,0.8,0.6,0.5],
"learning_rate": [1, 0.02, 0.05,0.1],
'max_features':[0.7,0.6,0.5,0.9],
'max_depth':[3,1,2],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv_gbm = RandomizedSearchCV(estimator=model, param_distributions=param_random_gbm, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv_gbm.fit(X_train_un,y_train_un)
print("Best parameters are {} with CV score={}:" .format(randomized_cv_gbm.best_params_,randomized_cv_gbm.best_score_))
Best parameters are {'subsample': 0.7, 'n_estimators': 105, 'max_features': 0.9, 'max_depth': 2, 'learning_rate': 0.1} with CV score=0.9446624803767663:
Wall time: 11.4 s
# Rebuild the GBM with the best parameters found by the randomized search.
# NOTE(review): the search above was run with init='zero', but the original
# rebuild omitted init, silently falling back to the default initial
# estimator; init='zero' is restored so the final model matches the tuned
# configuration.
gbm_under_tuned = GradientBoostingClassifier(
    init="zero",
    n_estimators=105,
    learning_rate=0.1,
    max_depth=2,
    subsample=0.7,
    max_features=0.9,
    random_state=1,
)
# Fit the model on the undersampled training data
gbm_under_tuned.fit(X_train_un, y_train_un)
GradientBoostingClassifier(max_depth=2, max_features=0.9, n_estimators=105,
random_state=1, subsample=0.7)
# Scores of the tuned GBM (init='zero') on the undersampled training set
Gradientboost_random_train = model_performance_classification_sklearn(
    gbm_under_tuned,
    X_train_un,
    y_train_un,
)
print("Training performance:")
Gradientboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.954918 | 0.963115 | 0.947581 | 0.955285 |
# Scores of the tuned GBM (init='zero') on the validation set
Gradientboost_random_val = model_performance_classification_sklearn(
    gbm_under_tuned, X_val, y_val
)
print("Validation performance:")
Gradientboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.928924 | 0.95092 | 0.707763 | 0.811518 |
# Confusion matrix for the tuned GBM (init='zero') on the validation set
confusion_matrix_sklearn(gbm_under_tuned, X_val, y_val)
%%time
# defining model
model = GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1)
# Parameter grid to pass in GridSearchCV
param_random_gbm1 = {
'n_estimators': np.arange(10,110,5),
'subsample':[0.7,0.8,0.6,0.5],
"learning_rate": [1,0.02,0.05,0.1],
'max_features':[0.7,0.6,0.5,0.9],
'max_depth':[3,1,2],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv_gbm1 = RandomizedSearchCV(estimator=model, param_distributions=param_random_gbm1, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv_gbm1.fit(X_train_un,y_train_un)
print("Best parameters are {} with CV score={}:" .format(randomized_cv_gbm1.best_params_,randomized_cv_gbm1.best_score_))
Best parameters are {'subsample': 0.5, 'n_estimators': 105, 'max_features': 0.6, 'max_depth': 3, 'learning_rate': 0.05} with CV score=0.9436525379382523:
Wall time: 28.8 s
# Rebuild the GBM with the best parameters found by the randomized search.
# NOTE(review): the search above was run with init=AdaBoostClassifier(
# random_state=1), but the original rebuild omitted init, silently falling
# back to the default initial estimator (the repr below the original cell
# confirms this); init is restored so the model matches the tuned setup.
gbm_under_tuned1 = GradientBoostingClassifier(
    init=AdaBoostClassifier(random_state=1),
    n_estimators=105,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.5,
    max_features=0.6,
    random_state=1,
)
# Fit the model on the undersampled training data
gbm_under_tuned1.fit(X_train_un, y_train_un)
GradientBoostingClassifier(learning_rate=0.05, max_features=0.6,
n_estimators=105, random_state=1, subsample=0.5)
# Scores of the tuned GBM (init=AdaBoost) on the undersampled training set
Gradientboost_random_train1 = model_performance_classification_sklearn(
    gbm_under_tuned1,
    X_train_un,
    y_train_un,
)
print("Training performance:")
Gradientboost_random_train1
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.95748 | 0.968238 | 0.947844 | 0.957932 |
# Scores of the tuned GBM (init=AdaBoost) on the validation set
Gradientboost_random_val1 = model_performance_classification_sklearn(
    gbm_under_tuned1, X_val, y_val
)
print("Validation performance:")
Gradientboost_random_val1
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.927937 | 0.935583 | 0.709302 | 0.806878 |
# Confusion matrix for the tuned GBM (init=AdaBoost) on the validation set
confusion_matrix_sklearn(gbm_under_tuned1, X_val, y_val)
# Tuned models trained on the undersampled data
# (original comment incorrectly said "raw data")
models_tuned = [abm_under_tuned, xgb_under_tuned, gbm_under_tuned, gbm_under_tuned1]

# Collect accuracy, recall, precision and F1 (train + validation) per model;
# get_metrics_score_under returns the eight scores in that order.
metric_rows = []
for model in models_tuned:
    scores = get_metrics_score_under(model, False)
    metric_rows.append([np.round(s, 2) for s in scores[:8]])

(acc_train, acc_val,
 recall_train, recall_val,
 precision_train, precision_val,
 f1_score_train, f1_score_val) = map(list, zip(*metric_rows))

comparison_frame_tuned = pd.DataFrame(
    {
        'Model': [
            'Ada Boost Undersampled Tuned',
            'XG Boost Undersampled Tuned',
            'Gradient Boost Undersampled Tuned with Init Zero',
            'Gradient Boost Undersampled Tuned with Init AdaBoost',
        ],
        'Train_Accuracy': acc_train,
        'Val_Accuracy': acc_val,
        'Train_Recall': recall_train,
        'Val_Recall': recall_val,
        'Train_Precision': precision_train,
        'Val_Precision': precision_val,
        'Train_f1score': f1_score_train,
        'Val_f1score': f1_score_val,
    }
)
comparison_frame_tuned
| Model | Train_Accuracy | Val_Accuracy | Train_Recall | Val_Recall | Train_Precision | Val_Precision | Train_f1score | Val_f1score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Ada Boost Undersampled Tuned | 0.97 | 0.93 | 0.98 | 0.95 | 0.96 | 0.71 | 0.97 | 0.81 |
| 1 | XG Boost Undersampled Tuned | 0.97 | 0.94 | 0.98 | 0.95 | 0.97 | 0.74 | 0.97 | 0.83 |
| 2 | Gradient Boost Undersampled Tuned with Init Zero | 0.95 | 0.93 | 0.96 | 0.95 | 0.95 | 0.71 | 0.96 | 0.81 |
| 3 | Gradient Boost Undersampled Tuned with Init Ad... | 0.96 | 0.93 | 0.97 | 0.94 | 0.95 | 0.71 | 0.96 | 0.81 |
# Final check: scores of the tuned XGBoost model on the held-out test set
xgboost_tuned_test = model_performance_classification_sklearn(
    xgb_under_tuned, X_test, y_test
)
print(" XG Boost Tuned Test performance:")
xgboost_tuned_test
XG Boost Tuned Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.938796 | 0.978462 | 0.731034 | 0.836842 |
# Confusion matrix for the tuned XGBoost model on the held-out test set
confusion_matrix_sklearn(xgb_under_tuned, X_test, y_test)
# Side-by-side comparison of the selected XGBoost model across the three splits
best_model_df = pd.concat(
    [xgboost_random_train.T, xgboost_random_val.T, xgboost_tuned_test.T],
    axis=1,
)
best_model_df.columns = [
    "XGBoost Tuned Train",
    "XGBoost Tuned Validation",
    "XGBoost Tuned Test",
]
print("XGBoost Performance Comparison:")
best_model_df
XGBoost Performance Comparison:
| XGBoost Tuned Train | XGBoost Tuned Validation | XGBoost Tuned Test | |
|---|---|---|---|
| Accuracy | 0.971311 | 0.939289 | 0.938796 |
| Recall | 0.976434 | 0.950920 | 0.978462 |
| Precision | 0.966531 | 0.743405 | 0.731034 |
| F1 | 0.971458 | 0.834455 | 0.836842 |
# Horizontal bar chart of XGBoost feature importances,
# sorted so the most important feature appears at the top
importances = xgb_under_tuned.feature_importances_
order = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(range(len(order)), importances[order], color='violet', align='center')
plt.yticks(range(len(order)), np.array(X_train_un.columns)[order])
plt.xlabel('Relative Importance')
plt.show()
# Fresh copy of the original data for the pipeline demonstration
Customer_data2 = Customer_data.copy()
# Drop the EDA-only binned "...Range" columns plus Credit_Limit
# (one of a highly correlated pair of columns)
cols_to_drop = [
    'Customer_AgeRange',
    'Months_on_bookRange',
    'Credit_LimitRange',
    'Total_Revolving_BalRange',
    'Avg_Open_To_BuyRange',
    'Total_Trans_AmtRange',
    'Total_Trans_CtRange',
    'Total_Ct_Chng_Q4_Q1Range',
    'Total_Amt_Chng_Q4_Q1Range',
    'Avg_Utilization_RatioRange',
    'Credit_Limit',
]
Customer_data2.drop(cols_to_drop, axis=1, inplace=True)
Customer_data2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null float64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 9015 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null float64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null float64 11 Contacts_Count_12_mon 10127 non-null float64 12 Total_Revolving_Bal 10127 non-null int64 13 Avg_Open_To_Buy 10127 non-null float64 14 Total_Amt_Chng_Q4_Q1 10127 non-null float64 15 Total_Trans_Amt 10127 non-null float64 16 Total_Trans_Ct 10127 non-null int64 17 Total_Ct_Chng_Q4_Q1 10127 non-null float64 18 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(5), float64(9), int64(5) memory usage: 1.1 MB
# Numerical columns: missing values are filled with the column median
numerical_features = [
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Avg_Utilization_Ratio",
]
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: impute with the mode, then one-hot encode.
# handle_unknown="ignore" lets the fitted encoder cope with categories
# that appear only in the test data.
categorical_features = [
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
    "Gender",
]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# remainder="passthrough" forwards any column not listed in
# numerical_features/categorical_features through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)

# Split predictors from the target
X = Customer_data2.drop(columns="Attrition_Flag")
Y = Customer_data2["Attrition_Flag"]

# Stratified 70/30 train-test split for the pipeline experiment
X_train_pip, X_test_pip, y_train_pip, y_test_pip = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train_pip.shape, X_test_pip.shape)
(7088, 18) (3039, 18)
# End-to-end pipeline: preprocessing followed by XGBoost with the
# hyperparameters found earlier by RandomizedSearchCV
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "XGB",
            XGBClassifier(
                random_state=1,
                n_estimators=95,
                learning_rate=0.5,
                max_depth=3,
                gamma=1,
                reg_lambda=20,
                subsample=0.6,
                colsample_bytree=0.5,
                colsample_bylevel=0.5,
                eval_metric="error",
            ),
        ),
    ]
)
# Fit the whole pipeline; imputers/encoder are fit on the training data only
model.fit(X_train_pip, y_train_pip)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
['Customer_Age',
'Dependent_count',
'Months_on_book',
'Total_Relationship_Count',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon',
'Total_Revolving_Bal',
'Avg_Open_To_Buy',
'Total_Trans_Amt',
'Total_Trans_Ct'...
gamma=1, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.5,
max_delta_step=0, max_depth=3,
min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=95,
n_jobs=4, num_parallel_tree=1, random_state=1,
reg_alpha=0, reg_lambda=20, scale_pos_weight=1,
subsample=0.6, tree_method='exact',
validate_parameters=1, verbosity=None))])